gc()
rm(list = ls())
set.seed(12345)
require(tidyverse)
require(readstata13)
require(readxl)
require(naniar)
require(tidyr)
require(countrycode)
require(data.table)
require(haven)
require(stringr)
require(dplyr)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) 
# Note: if you are not using R Studio this command will not work, 
# set WD to source file location manually

################################################################################
### World Bank WDI 
################################################################################

# Import full WDI data CSV from World Bank

df_wdidata_raw <- read.csv("../data/xnat/wdi/WDIData-2019.csv")

# Reshaping to long

df_wdidata_raw_long <- df_wdidata_raw %>%
  gather(year, 
         value, 
         -Country.Code, 
         -Indicator.Name, 
         -Indicator.Code, 
         -Country.Name)

# Dropping indicator name 
df_wdidata_raw_long$Indicator.Name <- NULL

# Changing year variable to numeric

df_wdidata_raw_long$year <- as.numeric(substr(df_wdidata_raw_long$year, 2, 5))


# Reshaping back to wide

df_wdidata_raw_wide <- df_wdidata_raw_long %>% 
  spread(Indicator.Code, 
         value)

# Renaming variables for clarity

df_wdidata_raw_rename <- df_wdidata_raw_wide %>%
  rename (cname = Country.Name, 
          wbcoded = Country.Code)

# Creating COW ccode variable
df_wdidata_raw_rename$ccode <- countrycode(df_wdidata_raw_rename$wbcoded, 
                                           "wb", 
                                           "cown")

# Ensuring no country is mistakenly removed
unique(df_wdidata_raw_rename$cname[is.na(df_wdidata_raw_rename$ccode)])


# Revising variable order for ease of reading
df_wdidata_raw_rename_reorder <- df_wdidata_raw_rename %>%
  select(cname, 
         ccode, 
         wbcoded:VC.PKP.TOTL.UN)

# Entering ccode for Serbia (missing from COW)

df_wdidata_raw_rename_reorder$ccode[df_wdidata_raw_rename_reorder$cname == 
                                      "Serbia"] <- 345



df_wdidata_ccodefix <- df_wdidata_raw_rename_reorder %>% 
  mutate(ccode = replace(ccode, 
                         cname == "Serbia", 
                         345)) 

# Dropping aggregates, territories, etc. not in WDI
df_wdidata_clean <- df_wdidata_raw_rename_reorder %>% 
  filter(!is.na(ccode))


#Verifying cname/ccode continuity - 194 countries
isTRUE(length(unique(df_wdidata_clean$cname)) == length(unique(df_wdidata_clean$ccode)))


#Date range (1960-2018)
summary(df_wdidata_clean$year)

#Renaming key variables for clarity
df_wdidata_clean_rename <- df_wdidata_clean %>%
  rename(wdip_trade_pct = NE.TRD.GNFS.ZS, 
         wdip_oda_gni = DT.ODA.ODAT.GN.ZS, 
         wdip_fuel_exports = TX.VAL.FUEL.ZS.UN, 
         wdip_fuel_imports = TM.VAL.FUEL.ZS.UN, 
         wdip_imports_gdp = NE.IMP.GNFS.ZS, 
         wdip_imports_growth = NE.IMP.GNFS.KD.ZG, 
         wdip_exports_gdp = NE.EXP.GNFS.ZS, 
         wdip_exports_growth = NE.EXP.GNFS.KD.ZG, 
         wdip_gdp_growth = NY.GDP.MKTP.KD.ZG, 
         wdip_gdp_current = NY.GDP.MKTP.CD, 
         wdip_gdp_ppc = NY.GDP.PCAP.CD, 
         wdip_gdp_cusd = NY.GDP.MKTP.KD, 
         wdip_gdp_ppp_cusd_i = NY.GDP.MKTP.PP.KD,  
         wdip_fdi_in_gdp = BX.KLT.DINV.WD.GD.ZS, 
         wdip_resource_rents = NY.GDP.TOTL.RT.ZS, 
         wdip_gdp_ppc_cons = NY.GDP.PCAP.KD, 
         wdip_urban_pct = SP.URB.TOTL.IN.ZS, 
         wdip_ag_value_added = NV.AGR.TOTL.ZS, 
         wdip_cgov_debt_gdp = GC.DOD.TOTL.GD.ZS, 
         wdip_pop_old = SP.POP.65UP.TO.ZS, 
         wdip_pop_young  = SP.POP.0014.TO.ZS, 
         wdip_pop_total = SP.POP.TOTL, 
         wdip_labor_unemp = SL.UEM.TOTL.ZS, 
         wdip_labor_unemp_natl = SL.UEM.TOTL.NE.ZS, 
         wdip_inflation = FP.CPI.TOTL.ZG, 
         wdip_ed_exp_gov_pct = SE.XPD.TOTL.GB.ZS, 
         wdip_ed_exp_gdp_pct  = SE.XPD.TOTL.GD.ZS, 
         wdip_health_exp_gdp_pct  = SH.XPD.GHED.GD.ZS, 
         wdip_health_exp_gov_pct  = SH.XPD.GHED.GE.ZS, 
         wdip_health_exp_ppp = SH.XPD.GHED.PP.CD, 
         wdip_health_exp_usd = SH.XPD.GHED.PC.CD, 
         wdip_gov_consump_gdp = NE.CON.GOVT.ZS, 
         wdip_fdi_out_gdp = BM.KLT.DINV.WD.GD.ZS, 
         wdip_military_gdp = MS.MIL.XPND.GD.ZS, 
         wdip_industry_va = NV.IND.TOTL.ZS, 
         wdip_industry_emp = SL.IND.EMPL.ZS, 
         wdip_informal_emp = SL.ISV.IFRM.FE.ZS, 
         wdip_labor_part_rate = SL.TLF.ACTI.ZS, 
         wdip_sanitation = SH.STA.BASS.ZS, 
         wdip_water = SH.H2O.BASW.UR.ZS, 
         wdip_rain = AG.LND.PRCP.MM, 
         wdip_infant_mort = SP.DYN.IMRT.IN, 
         wdip_gross_fixed_cap = NE.GDI.FTOT.ZS, 
         wdip_mine_rents_gdp = NY.GDP.MINR.RT.ZS, 
         wdip_ict_phone_p100 = IT.MLT.MAIN.P2, 
         wdip_mobiles_p1000 = IT.CEL.SETS.P2, 
         wdip_coal_rents_gdp = NY.GDP.COAL.RT.ZS, 
         wdip_forest_rents_gdp = NY.GDP.FRST.RT.ZS, 
         wdip_greenhouse_gas = EN.ATM.GHGT.KT.CE, 
         wdip_age_dependency = SP.POP.DPND) %>% 
  dplyr::filter(!is.na(year))


#Keeping only needed variables

df_wdidata_clean_final <- 
  df_wdidata_clean_rename[, c("cname", 
                              "ccode", 
                              "wbcoded", 
                              "year", 
                              colnames(df_wdidata_clean_rename)
                              [grep("wdip",colnames(df_wdidata_clean_rename))])]

df_wdidata_clean_final$tag <- 1

################################################################################
### Correlates of War State System Membership v2016
################################################################################

#Import Correlates of War State-Year Dataset
df_cow <- read_csv("../data/xnat/cow/states2016.csv")

df_cow_current <- df_cow %>%
  subset(endyear > 1959)

# Assigning years for persisent codes for Yemen, Germany, and Czech Republic to match with WDI
# Our dataset treats:
# 1. Germany as a continuation of West Germany (excluding East Germany)
# 2. Vietnam as a continuation of North Vietnam (excluding South Vietnam)
# 3. Yemen as a continuation of Republic of Yemen (excluding Yemen, P.R.)
# 4. Yugoslavia as modern "Serbia"
# 5. Czech Republic as a continuation of Czechoslovakia
# 6. Russia as a continuation of the USSR

df_cow_current$styear[df_cow_current$statenme == "Yemen"] <- 1926

df_cow_current_ymnfix <- df_cow_current %>%
  filter(statenme != "Yemen People's Republic" & 
           statenme != "Yemen Arab Republic" & 
           statenme != "Republic of Vietnam")


df_cow_current_ymnfix$styear[df_cow_current_ymnfix$statenme == 
                               "Germany"] <- 1955

df_cow_current_ymnfix$styear[df_cow_current_ymnfix$statenme == 
                               "Czech Republic"] <- 1955

#Revising variable names for continuity and keeping only relevant vars

df_cow_varnamesfix <- df_cow_current_ymnfix %>%
  rename(cname = statenme)

df_cow_clean <- df_cow_varnamesfix[, c("styear", 
                                       "cname", 
                                       "ccode")]

################################################################################
### Varieties of Democracy v10
################################################################################

#Import V-dem data

df_vdem <- readRDS("../data/xnat/vdem/V-Dem-CY-Full+Others-v9.rds")

#Renaming to maintain uniformity

df_vdem_rename <- df_vdem %>% 
  rename(cname = country_name)

df_vdem_keepvars <- df_vdem_rename %>% 
  select(cname, 
         year,
         v2x_accountability, 
         v2x_rule, 
         v2x_veracc, 
         v2x_horacc, 
         v2x_diagacc, 
         v2juaccnt, 
         v2x_corr, 
         v2x_pubcorr, 
         v2x_execorr, 
         v2exbribe, 
         v2excrptps, 
         v2lgcrrpt, 
         v2jucorrdc, 
         v2mecorrpt, 
         e_v2x_corr_3C, 
         e_v2x_corr_4C, 
         e_v2x_corr_5C, 
         e_ti_cpi, 
         v2x_polyarchy, 
         v2x_api, 
         v2x_libdem, 
         v2x_partipdem, 
         v2x_delibdem, 
         v2xel_locelec, 
         v2xel_regelec, 
         e_total_oil_income_pc,  
         v2xel_elecpres, 
         v2xel_elecparl, 
         v2x_libdem, 
         v2x_delibdem, 
         v2x_partipdem, 
         v2x_egaldem, 
         v2cltrnslw, 
         e_fh_rol, 
         e_ti_cpi,  
         v2strenadm, 
         v2x_gender, 
         v2xnp_regcorr, 
         v2x_partip, 
         v2x_cspart, 
         v2csprtcpt)

#Keeping years 1960-present
df_vdem_current <- df_vdem_keepvars %>%
  subset(year > 1959)

#Standardizing ccodes to match world bank
df_vdem_current$ccode <- countrycode(df_vdem_current$cname, 
                                           "country.name", 
                                           "cown")

#Revising variable order for ease of reading
df_vdem_current_reorder <- df_vdem_current %>%
  select(cname, 
         ccode, 
         year:v2csprtcpt)


df_vdem_current_reorder$ccode[df_vdem_current_reorder$cname == 
                                "Serbia"] <- 345




df_vdem_countryfix <- df_vdem_current_reorder %>%
  filter(cname != "South Yemen" & 
           cname != "Zanzibar" & 
           cname != "German Democratic Republic" & 
           cname != "Republic of Vietnam")

# Ensuring no country is mistakenly removed
unique(df_vdem_countryfix$cname[is.na(df_vdem_countryfix$ccode)])

# Dropping aggregates, territories, etc. not in WDI
df_vdem_clean <- df_vdem_countryfix %>% 
  filter(!is.na(ccode))

################################################################################
# ICTD UNU-WIDER United Nations University World Institute for 
# Development Economics Research GRD - Government Revenue Dataset
################################################################################

df_ictd_raw <- read.dta13("../data/xnat/ictd/Merged-2019-ICTD.dta")

# Rename vars
df_ictd_raw_working <- df_ictd_raw %>%
  rename(cname = country, 
         wbcode = iso)


# Setting ccode
df_ictd_raw_working$ccode <- countrycode(df_ictd_raw_working$wbcode, 
                                           "wb", 
                                           "cown")

# Ensuring no country is mistakenly removed

unique(df_ictd_raw_working$cname[is.na(df_ictd_raw_working$ccode)])

df_ictd_raw_working$ccode[df_ictd_raw_working$cname ==
                              "Serbia"] <- 345


df_ictd_raw_working$ccode[df_ictd_raw_working$cname == 
                            "Kosovo"] <- 347


df_ictd_raw_working_cou <- df_ictd_raw_working %>% 
  filter(!is.na(ccode))

df_ictd_reorder <- df_ictd_raw_working_cou %>%
  select(cname, 
         ccode, 
         wbcode, 
         year, 
         identifier, 
         general, 
         source, 
         id, 
         reg, 
         inc, 
         rev_inc_sc:socialcontributionsnotes)



# Reassigning names from older versions for easier reference

df_ictd_addvars <- df_ictd_reorder

df_ictd_addvars$gst_rev <- df_ictd_addvars$tax_g_s
df_ictd_addvars$trade <- df_ictd_addvars$tax_trade
df_ictd_addvars$indirect <- df_ictd_addvars$nr_indirect
df_ictd_addvars$corp <- df_ictd_addvars$tax_nr_corp
df_ictd_addvars$indiv <- df_ictd_addvars$tax_indiv
df_ictd_addvars$direct <- df_ictd_addvars$direct_ex_sc_ex_rt
df_ictd_addvars$totnontax <- df_ictd_addvars$nontax
df_ictd_addvars$tottax <- df_ictd_addvars$tax_ex_sc
df_ictd_addvars$totrev <- df_ictd_addvars$rev_ex_gr_ex_sc
df_ictd_addvars$income <- df_ictd_addvars$tax_income


# Certain observations are flagged for quality or accuracy issues
# Replace tax data as missing for those observations which are flagged

df_ictd_taxfix <- df_ictd_addvars
df_ictd_taxfix_vars <- df_ictd_taxfix %>%
  select(rev_inc_sc:grants, 
         gst_rev:income)

tax_vars <- as.data.frame(c(names(df_ictd_taxfix_vars)))

df_ictd_taxfix_final <- df_ictd_taxfix %>% 
  mutate(rev_inc_sc = replace(rev_inc_sc, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(rev_ex_sc = replace(rev_ex_sc, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(rev_ex_gr_inc_sc = replace(rev_ex_gr_inc_sc, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(rev_ex_gr_ex_sc = replace(rev_ex_gr_ex_sc, caution1accuracyqualityorco == 1, NA)) %>%
  mutate(tot_res_rev = replace(tot_res_rev, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tot_nres_rev_inc_sc = replace(tot_nres_rev_inc_sc, caution1accuracyqualityorco == 1, NA)) %>%
  mutate(tax_inc_sc = replace(tax_inc_sc, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_ex_sc = replace(tax_ex_sc, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(resourcetaxes = replace(resourcetaxes, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(nrtax_inc_sc = replace(nrtax_inc_sc, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(nrtax_ex_sc = replace(nrtax_ex_sc, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(direct_inc_sc_inc_rt = replace(direct_inc_sc_inc_rt, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(direct_inc_sc_ex_rt = replace(direct_inc_sc_ex_rt, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(direct_ex_sc_inc_rt = replace(direct_ex_sc_inc_rt, caution1accuracyqualityorco == 1, NA)) %>%
  mutate(direct_ex_sc_ex_rt = replace(direct_ex_sc_ex_rt, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_income = replace(tax_income, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_res_income = replace(tax_res_income, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_nr_income = replace(tax_nr_income, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_indiv = replace(tax_indiv, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_corp = replace(tax_corp, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_res_corp = replace(tax_res_corp, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_nr_corp = replace(tax_nr_corp, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_payr_workf = replace(tax_payr_workf, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_property = replace(tax_property, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_indirect = replace(tax_indirect, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(res_indirect = replace(res_indirect, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(nr_indirect = replace(nr_indirect, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_g_s = replace(tax_g_s, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_gs_general = replace(tax_gs_general, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_gs_vat = replace(tax_gs_vat, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_gs_excises = replace(tax_gs_excises, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_trade = replace(tax_trade, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_trade_import = replace(tax_trade_import, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_trade_export = replace(tax_trade_export, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tax_other = replace(tax_other, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(nontax = replace(nontax, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(res_nontax = replace(res_nontax, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(nr_nontax = replace(nr_nontax, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(sc = replace(sc, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(grants = replace(grants, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(gst_rev = replace(gst_rev, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(trade = replace(trade, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(indirect = replace(indirect, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(corp = replace(corp, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(indiv = replace(indiv, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(direct = replace(direct, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(totnontax = replace(totnontax, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(tottax = replace(tottax, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(totrev = replace(totrev, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(income = replace(income, caution1accuracyqualityorco == 1, NA)) %>% 
  mutate(rev_inc_sc = replace(rev_inc_sc, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(rev_ex_sc = replace(rev_ex_sc, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(rev_ex_gr_inc_sc = replace(rev_ex_gr_inc_sc, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(rev_ex_gr_ex_sc = replace(rev_ex_gr_ex_sc, caution2resourcerevenuestax == 1, NA)) %>%
  mutate(tot_res_rev = replace(tot_res_rev, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tot_nres_rev_inc_sc = replace(tot_nres_rev_inc_sc, caution2resourcerevenuestax == 1, NA)) %>%
  mutate(tax_inc_sc = replace(tax_inc_sc, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_ex_sc = replace(tax_ex_sc, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(resourcetaxes = replace(resourcetaxes, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(nrtax_inc_sc = replace(nrtax_inc_sc, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(nrtax_ex_sc = replace(nrtax_ex_sc, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(direct_inc_sc_inc_rt = replace(direct_inc_sc_inc_rt, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(direct_inc_sc_ex_rt = replace(direct_inc_sc_ex_rt, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(direct_ex_sc_inc_rt = replace(direct_ex_sc_inc_rt, caution2resourcerevenuestax == 1, NA)) %>%
  mutate(direct_ex_sc_ex_rt = replace(direct_ex_sc_ex_rt, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_income = replace(tax_income, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_res_income = replace(tax_res_income, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_nr_income = replace(tax_nr_income, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_indiv = replace(tax_indiv, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_corp = replace(tax_corp, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_res_corp = replace(tax_res_corp, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_nr_corp = replace(tax_nr_corp, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_payr_workf = replace(tax_payr_workf, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_property = replace(tax_property, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_indirect = replace(tax_indirect, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(res_indirect = replace(res_indirect, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(nr_indirect = replace(nr_indirect, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_g_s = replace(tax_g_s, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_gs_general = replace(tax_gs_general, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_gs_vat = replace(tax_gs_vat, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_gs_excises = replace(tax_gs_excises, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_trade = replace(tax_trade, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_trade_import = replace(tax_trade_import, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_trade_export = replace(tax_trade_export, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tax_other = replace(tax_other, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(nontax = replace(nontax, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(res_nontax = replace(res_nontax, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(nr_nontax = replace(nr_nontax, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(sc = replace(sc, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(grants = replace(grants, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(gst_rev = replace(gst_rev, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(trade = replace(trade, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(indirect = replace(indirect, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(corp = replace(corp, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(indiv = replace(indiv, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(direct = replace(direct, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(totnontax = replace(totnontax, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(tottax = replace(tottax, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(totrev = replace(totrev, caution2resourcerevenuestax == 1, NA)) %>% 
  mutate(income = replace(income, caution2resourcerevenuestax == 1, NA))


df_ictd_clean <- df_ictd_taxfix_final %>%
  select(cname, 
         ccode, 
         year, 
         caution1accuracyqualityorco, 
         caution2resourcerevenuestax, 
         rev_inc_sc:grants, 
         gst_rev:income)

################################################################################
### University of Gothenburg Quality of Governance (QOG) Indicators
### Version January 2019 Time Series
################################################################################

# Quality of governance indicators
df_qog_raw <- read.dta13("../data/xnat/qog/qog_std_ts_jan19.dta")


#Drop QOG ccode to ensure continuity across datasets
df_qog_raw_noccode <- df_qog_raw %>%
  select(-ccode)

# Our dataset treats:
# 1. Germany as a continuation of West Germany (excluding East Germany)
# 2. Vietnam as a continuation of North Vietnam (excluding South Vietnam)
# 3. Yemen as a continuation of Republic of Yemen (excluding Yemen, P.R.)
# 4. Yugoslavia as modern "Serbia"
# 5. Czech Republic as a continuation of Czechoslovakia
# 6. Russia as a continuation of the USSR

df_qog_coufix <- df_qog_raw_noccode %>%
  filter(!(year > 1974 & cname == "Cyprus (-1974)")) %>%
  filter(!(year < 1975 & cname == "Cyprus (-1975)")) %>%
  filter(!(year > 1992 & cname == "Czechoslovakia")) %>%
  filter(!(year < 1993 & cname == "Czech Republic")) %>%
  filter(!(year < 1993 & cname == "Slovakia")) %>%
  filter(!(year > 1992 & cname == "Ethiopia (-1992)")) %>%
  filter(!(year < 1993 & cname == "Ethiopia (1993-)")) %>%
  filter(!(year > 1962 & cname == "France (-1962)")) %>%
  filter(!(year < 1963 & cname == "France (1963-)")) %>%
  filter(!(year < 1991 & cname == "Germany")) %>%
  filter(!(year > 1990 & cname == "Germany, West")) %>%
  filter(!(cname == "Germany, East")) %>%
  filter(!(year > 1965 & cname == "Malaysia (-1965)")) %>%
  filter(!(year < 1966 & cname == "Malaysia (1966-)")) %>%
  filter(!(year > 1970 & cname == "Pakistan (-1970)")) %>%
  filter(!(year < 1971 & cname == "Pakistan (1971-)")) %>%
  filter(!(year > 1991 & cname == "USSR")) %>%
  filter(!(year < 1992 & cname == "Russia")) %>%
  filter(!(year > 2011 & cname == "Sudan (-2011)")) %>%
  filter(!(year < 2012 & cname == "Sudan (2012-)")) %>%
  filter(!(cname == "Tibet")) %>%
  filter(!(cname == "Vietnam, South")) %>%
  filter(!(year > 1976 & cname == "Vietnam, North")) %>% 
  filter(!(year < 1977 & cname == "Vietnam")) %>%
  filter(!(year > 1989 & cname == "Yemen, North")) %>%
  filter(!(cname == "Yemen, South")) %>%
  filter(!(year < 1990 & cname == "Yemen")) %>%
  filter(!(year > 1991 & cname == "Yugoslavia")) %>%
  filter(!(year < 1992 & cname == "Serbia and Montenegro")) %>%
  filter(!(year > 2007 & cname == "Serbia and Montenegro")) %>%
  filter(!(year < 2008 & cname == "Serbia")) 

df_qog_counamefix <- df_qog_coufix  

df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("Cyprus (-1974)", 
                            "Cyprus (1975-)")] <- "Cyprus"

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "Czechoslovakia"] <- "Czech Republic"
  
df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("Ethiopia (-1992)", 
                            "Ethiopia (1993-)")] <- "Ethiopia"

df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("France (-1962)", 
                            "France (1963-)")] <- "France"

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "Germany, West"] <- "Germany"

df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("Malaysia (-1965)", 
                            "Malaysia (1966-)")] <- "Malaysia"

df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("Pakistan (-1970)", 
                            "Pakistan (1971-)")] <- "Pakistan"

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "USSR"] <- "Russia"


df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("Pakistan (-1970)", 
                            "Pakistan (1971-)")] <- "Pakistan"


df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("Sudan (-2011)", 
                            "Sudan (2012-)")] <- "Sudan"



df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("Yemen, North", 
                            "Yemen")] <- "Yemen, Rep."


df_qog_counamefix$cname[df_qog_counamefix$cname %in% 
                          c("Serbia and Montenegro", 
                            "Serbia")] <- "Yugoslavia"

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "Congo"] <- "Congo, Rep."

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "Congo, Democratic Republic"] <- "Congo, Dem. Rep."

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "Korea, North"] <- "North Korea"

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "Korea, South"] <- "South Korea"

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "St Vincent and the Grenadines"] <- "St. Vincent and the Grenadines"

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "St Lucia"] <- "St. Lucia"

df_qog_counamefix$cname[df_qog_counamefix$cname == 
                          "St Kitts and Nevis"] <- "St. Kitts and Nevis"


df_qog_ccode <- df_qog_counamefix

df_qog_ccode$ccode <- countrycode(df_qog_ccode$cname, 
                                 "country.name", 
                                 "cown")
df_qog_ccode$ccode[df_qog_ccode$cname == "Micronesia"] <- 987





df_qog_keepvars <- df_qog_ccode %>% 
  select(cname, 
         ccode, 
         year, 
         ajr_settmort, 
         al_ethnic, 
         al_language, 
         al_religion, 
         bci_bci, 
         ht_colonial, 
         ti_cpi, 
         lp_catho80, 
         lp_legor, 
         lp_muslim80, 
         lp_no_cpm80, 
         lp_protmg80, 
         al_ethnic, 
         al_language, 
         al_religion, 
         shec_se, 
         dpi_author, 
         dpi_system, 
         dpi_plurality, 
         dpi_pr, 
         dpi_cl, 
         wdi_gnicapatlcur, 
         vdem_corr, 
         aii_acc, 
         bci_bci, 
         p_polity2, 
         bti_gp, 
         ffp_ps, 
         icrg_qog, 
         iiag_acc, 
         ti_cpi, 
         vdem_execorr, 
         vdem_pubcorr, 
         wbgi_cce, 
         wel_coc, 
         bti_ffe, 
         cam_contest, 
         cpds_vt, 
         ess_trparl, 
         ess_trpart, 
         ess_trpolit, 
         iiag_par, 
         van_index, 
         van_part, 
         vdem_delibdem, 
         vdem_edcomp_thick, 
         vdem_excrptps, 
         vdem_gcrrpt, 
         vdem_libdem, 
         vdem_partip, 
         vdem_polyarchy)

df_qog_clean <- df_qog_keepvars %>%
  filter(year > 1959)

################################################################################
### Tax Introduction Database (TID) version May 2019
################################################################################

df_tid_raw <- read_delim("../data/xnat/tid/TID_selection_2019.csv", 
                       skip = 8, 
                       delim = "|")

df_tid_raw_rename <- df_tid_raw %>%
  rename(cname = country)
  
df_tid_cnamefix <- df_tid_raw_rename

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Lao PDR"] <- "Laos"
  
df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Malaysia (Federation of malay)"] <- "Malaysia" 

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "United Arab Emirates (UAE)"] <- "United Arab Emirates"  


df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Dem. Rep. Korea"] <- "North Korea"

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Rep. Korea"] <- "Korea, Rep."

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Republic of Vietnam"] <- "South Vietnam"

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Yemen" | df_tid_cnamefix$cname == 
                        "Rep. Yemen"] <- "Yemen, Rep."

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Estonia"] <- "Estonia (pre-WWII)"

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Estonia II"] <- "Estonia"

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Latvia"] <- "Latvia (pre-WWII)"

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Latvia II"] <- "Latvia"

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Lithuania"] <- "Lithuania (pre-WWII)"

df_tid_cnamefix$cname[df_tid_cnamefix$cname == 
                        "Lithuania II"] <- "Lithuania"

df_tid_ccode <- df_tid_cnamefix

df_tid_ccode$ccode <- countrycode(df_tid_ccode$cname, 
                                  "country.name", 
                                  "cown")
df_tid_ccode$ccode[df_tid_ccode$cname == 
                     "Serbia "] <- 345  

df_tid_ccode$ccode[df_tid_ccode$cname == 
                     "Micronesia"] <- 987  


df_tid_reorder <- df_tid_ccode %>%
  select(cname, 
         ccode, 
         year, 
         tax:status, 
         source_id:comment)

# Ensuring no countries are erroneously excluded
unique(df_tid_reorder$cname[is.na(df_tid_reorder$ccode)])

df_tid_countriessub <- df_tid_reorder %>%
  filter(!(is.na(ccode)))

df_tid_missingfix <- df_tid_countriessub

df_tid_missingfix$year[(df_tid_missingfix$status %in% 
                         c("missing", 
                           "introduced")) & 
                         is.na(df_tid_missingfix$year) == TRUE] <- 7777




df_tid_missingfix$year[df_tid_missingfix$status == 
                         "not introduced"] <- 9999

df_tid_missingfix$mode[df_tid_missingfix$mode == 
                         "missing"] <- ""

df_tid_missingfix$status[df_tid_missingfix$status == 
                           "missing"] <- ""

df_tid_tid_status <- df_tid_missingfix

df_tid_tid_status$tid_status <- if_else(df_tid_tid_status$status == 
                                          "introduced", 
                                        1, 
                                        0)

df_tid_tid_status$tid_status[df_tid_tid_status$status == 
                               ""] <- NA

df_tid_keepvars <- df_tid_tid_status %>%
  select(cname, 
         ccode, 
         tax, 
         year) %>%
  filter(!(ccode==679 & tax == "CIT" & year == 7777)) %>% 
  filter(!(ccode==679 & tax == "PIT" & year == 1968)) %>% 
  filter(!(ccode==679 & tax == "SSC" & year == 1987)) %>% 
  filter(!(ccode==679 & tax == "VAT" & year == 9999)) %>% 
  distinct()




# Reshaping to a wide set


df_tid_wide <- df_tid_keepvars %>% 
  spread(tax, 
         year) %>%
  filter(!cname == "Great Colombia") %>%
  filter(!(grepl("pre-WWII", cname))) %>%
  filter(!cname == "Yugoslavia")

df_tid_clean <- df_tid_wide[order(df_tid_wide$ccode), ]


################################################################################
### UCDP/PRIO Armed Conflict Dataset
### UCDP Dyadic Dataset version 19.1
################################################################################


df_prio_raw <- readRDS("../data/xnat/prio/Dyadic_v19_1.rds")

# Location refers to main government parties to each conflict, 
# not the geographic location of the conflict

# Splitting strings to generate cow codes for each country

df_prio_locsplit <- df_prio_raw %>%
  separate('location', 
           paste("location", 
                 1:6, 
                 sep = "_"), 
           sep = ",", 
           extra = "drop")

# Dropping unneeded variables
df_prio_varkeep <- df_prio_locsplit %>%
  select(location_1:location_6, 
         conflict_id, 
         side_b_id, 
         year, 
         type_of_conflict)

# Reshaping in order to merge on country-year

df_prio_long <- df_prio_varkeep %>%
  gather(location, 
         num, 
         -c(conflict_id, 
            side_b_id, 
            year, 
            type_of_conflict))

df_prio_long$location <- df_prio_long$num

df_prio_long_numdrop <- df_prio_long %>%
  select(-num)

df_prio_confind <- df_prio_long_numdrop

# Generating indicator variables based on conflict type

df_prio_confind$prio_extrasystemic_conflict <- 
  as.numeric(df_prio_confind$type_of_conflict == 1)

df_prio_confind$prio_interstate_conflict <- 
  as.numeric(df_prio_confind$type_of_conflict == 2)

df_prio_confind$prio_civil_wars <- 
  as.numeric(df_prio_confind$type_of_conflict == 3)

df_prio_confind$prio_civil_wars_intl <- 
  as.numeric(df_prio_confind$type_of_conflict == 4)

# Dropping blank locations from reshape and dropping unneeded vars

df_prio_filter <- df_prio_confind %>%
  filter(!is.na(location)) %>%
  select(conflict_id, 
         location, 
         year, 
         type_of_conflict, 
         prio_extrasystemic_conflict, 
         prio_interstate_conflict, 
         prio_civil_wars, 
         prio_civil_wars_intl)

df_prio_country <- df_prio_filter

# Adding COW country codes

df_prio_country$location[df_prio_country$location == 
                           "Hyderabad"] <- "India"

df_prio_country$location[df_prio_country$location == 
                           "Vietnam (North Vietnam)"] <- "Vietnam"

df_prio_country$location[df_prio_country$location == 
                           " Vietnam (North Vietnam)"] <- "Vietnam"

df_prio_country$location[df_prio_country$location == 
                           "South Vietnam"] <- "Vietnam"

df_prio_country$location[df_prio_country$location == 
                           "Yemen (North Yemen)"] <- "Yemen, Rep."

df_prio_country$location[df_prio_country$location == 
                           " Yemen (North Yemen)"] <- "Yemen, Rep."

df_prio_country$location[df_prio_country$location == 
                           "South Yemen"] <- "Yemen, Rep."

df_prio_country$ccode <- countrycode(df_prio_country$location, 
                                     "country.name", 
                                     "cown")

df_prio_country$ccode[df_prio_country$location == 
                        "FYR"] <- 343

df_prio_collapse <- df_prio_country %>%
  group_by(ccode, 
           year) %>%
  summarise(prio_extrasystemic_conflict = max(prio_extrasystemic_conflict), 
            prio_interstate_conflict = max(prio_interstate_conflict), 
            prio_civil_wars = max(prio_civil_wars), 
            prio_civil_wars_intl = max(prio_civil_wars_intl))

df_prio_order <- df_prio_collapse %>%
  select(c(ccode, 
           year, 
           prio_extrasystemic_conflict:prio_civil_wars_intl))

df_prio_order$cname <- countrycode(df_prio_order$ccode, 
                                   "cown", 
                                   "country.name")


df_prio_clean <- df_prio_order %>%
  select(cname, 
         ccode, 
         year, 
         prio_extrasystemic_conflict:prio_civil_wars_intl)

################################################################################
### WB Income Groups
################################################################################

df_wbinc_raw <- read_excel("../data/xnat/wdi/OGHIST.xlsx", 
                           sheet = "Country Analytical History", 
                           skip = 5)
# Drop non-relevant rows

df_wbinc_filter <- df_wbinc_raw %>%
  filter(!is.na(df_wbinc_raw$...1))

# Renaming variables
df_wbinc_rename <- df_wbinc_filter %>%
  rename(wbcoded = ...1, 
         countryname = "Data for calendar year :")

df_wbinc_rename_yrfix <- df_wbinc_rename %>%
  setNames(paste0("year_", names(.))) %>%
  rename(wbcoded = year_wbcoded, 
         countryname = year_countryname)


# Reshaping
df_wbinc_reshape <- df_wbinc_rename %>%
  gather(year_, 
         wdi_income, 
         -c(wbcoded, 
            countryname))

df_wbinc_reshape$wdi_income[df_wbinc_reshape$wdi_income == 
                              ".."] <- ""

df_wbinc_cntryfix <- df_wbinc_reshape

df_wbinc_cntryfix$year_ = as.numeric(df_wbinc_cntryfix$year_)

df_wbinc_cntryfix$wbcoded[df_wbinc_cntryfix$wbcoded == 
                            c("YUGf") & 
                            df_wbinc_cntryfix$year_ > 1986 & 
                            df_wbinc_cntryfix$year_ < 1992] <- "SRB"


df_wbinc_cntryfix$wbcoded[df_wbinc_cntryfix$wbcoded == "YUG" & 
                            df_wbinc_cntryfix$year_ > 1991 & 
                            df_wbinc_cntryfix$year_ < 2006] <- "SRB"

df_wbinc_serbfix <- df_wbinc_cntryfix %>%
  filter(!(countryname == "Serbia" & year_ > 1986 & year_ < 2006))

df_wbinc_serbfix$wbcoded[df_wbinc_serbfix$wbcoded == "SUN" & 
                            df_wbinc_serbfix$year_ == 1990] <- "RUS"

df_wbinc_rusfix <- df_wbinc_serbfix %>%
  filter(!(countryname == "Russian Federation" & year_ == 1990))

df_wbinc_rusfix$wbcoded[df_wbinc_rusfix$wbcoded == "CSK" & 
                            df_wbinc_rusfix$year_ %in% c(1990, 1991)] <- "CZE"

df_wbinc_czechfix <- df_wbinc_rusfix %>%
  filter(!(countryname == "Czech Republic" & year_ %in% c(1990, 1991)))


df_wbinc_czechfix$ccode <- countrycode(df_wbinc_czechfix$wbcoded, 
                                       "wb", 
                                       "cown")

unique(df_wbinc_czechfix$countryname[is.na(df_wbinc_czechfix$ccode)])

df_wbinc_ccodefix <- df_wbinc_czechfix

df_wbinc_ccodefix$ccode[df_wbinc_ccodefix$wbcoded == 
                          "SRB"] <- 345

df_wbinc_ccodefix$ccode[df_wbinc_ccodefix$countryname == 
                          "Aruba"] <- 533

df_wbinc_ccodefix$ccode[df_wbinc_ccodefix$countryname == 
                          "West Bank and Gaza"] <- 665


df_wbinc_cnamefix <- df_wbinc_ccodefix %>%
  select(-countryname) %>%
  rename(year = year_) %>%
  filter(!is.na(ccode))

df_wbinc_cnamefix$cname <- countrycode(df_wbinc_cnamefix$ccode, 
                                       "cown", 
                                       "country.name")

df_wbinc_cnamefix$cname[df_wbinc_cnamefix$ccode == 
                          c(345)] <- c("Serbia")

df_wbinc_cnamefix$cname[df_wbinc_cnamefix$ccode == 
                          c(533)] <- c("Aruba")

df_wbinc_cnamefix$cname[df_wbinc_cnamefix$ccode == 
                          c(665)] <- c("West Bank and Gaza")


df_wbinc_clean <- df_wbinc_cnamefix %>%
  select(c(cname, 
           ccode, 
           year, 
           wdi_income))

################################################################################
### World Bank Regions
################################################################################

df_wbreg_raw <- read_excel("../data/xnat/wdi/CLASS.xlsx", 
                           skip = 4)
df_wbreg_varkeep <- df_wbreg_raw %>%
  select(Code, 
         Region, 
         Economy) %>%
  rename(wbcoded = Code, 
         region = Region, 
         ctryname = Economy) %>%
  filter(!is.na(region)) %>%
  filter(!(region == "x"))

df_wbreg_varkeep$ccode <- countrycode(df_wbreg_varkeep$wbcoded, 
                                      "wb", 
                                      "cown")

unique(df_wbreg_varkeep$ctryname[is.na(df_wbreg_varkeep$ccode)])

df_wbreg_ctryfix <- df_wbreg_varkeep

df_wbreg_ctryfix$ccode[df_wbreg_ctryfix$ctryname == "Serbia"] <- 345
df_wbreg_ctryfix$ccode[df_wbreg_ctryfix$ctryname == "Aruba"] <- 533
df_wbreg_ctryfix$ccode[df_wbreg_ctryfix$ctryname == "West Bank and Gaza"] <- 665

df_wbreg_clean <- df_wbreg_ctryfix %>%
  filter(!is.na(ccode)) %>%
  select(ccode, 
         region)

################################################################################
### Polity V
################################################################################
df_polity_raw <- read_sav("../data/xnat/polity/p4v2018.sav")

df_polity_ctrfix <- df_polity_raw %>% 
  mutate(ccode = replace(ccode, 
                         scode == "KOS", 
                         347)) %>%
  mutate(ccode = replace(ccode, 
                         scode == "SER", 
                         345)) %>% 
  mutate(ccode = replace(ccode, 
                         scode == "YGS", 
                         345)) %>% 
  mutate(ccode = replace(ccode, 
                         scode == "SSU", 
                         626)) %>% 
  mutate(ccode = replace(ccode, 
                         scode == "VIE", 
                         816)) %>% 
  mutate(ccode = replace(ccode, 
                         scode == "GFR", 
                         255)) %>% 
  mutate(ccode = replace(ccode, 
                         scode == "ETI", 
                         530)) %>% 
  mutate(ccode = replace(ccode, 
                         scode == "CZE", 
                         316)) %>% 
  mutate(ccode = replace(ccode, 
                         scode %in% c("USR", "USSR"), 
                         365)) %>% 
  mutate(ccode = replace(ccode, 
                         scode == "PKS", 
                         770)) %>% 
  mutate(ccode = replace(ccode, 
                         scode == "YAR", 
                         679)) %>% 
  mutate(ccode = replace(ccode, 
                       scode == "SDN", 
                       625)) %>% 
  mutate(ccode = replace(ccode, 
                         country == "Montenegro", 
                         341)) %>% 
  filter(!(scode == "SDN" & year == 2011)) %>%
  filter(!(scode == "GFR" & year == 1945)) %>% 
  filter(!(scode == "GMY" & year == 1990)) %>% 
  filter(!(scode == "DRV" & year == 1976)) %>% 
  filter(!(scode == "RUS" & year == 1922)) %>% 
  filter(!(scode == "YEM" & year == 1990)) %>% 
  filter(!(scode == "ETI" & year == 1993)) %>%
  filter(!(ccode == 345 & year == 1991 & regtrans == 96)) %>%
  filter(!(ccode == 345 & year == 2006 & regtrans == 98)) %>%
  select(ccode, 
         year, 
         regtrans, 
         polity2)
  
df_polity_vargendem <- df_polity_ctrfix %>%
  mutate(dem_transition = ifelse((regtrans == 2 | 
                                    regtrans == 3), 
                                 1, 
                                 0)) %>% 
  mutate(dem_transition = replace(dem_transition, 
                                  is.na(regtrans), 
                                  0))
df_polity_vargenauth <- df_polity_vargendem %>%
  mutate(auth_transition = ifelse((regtrans == (-2)), 
         1, 
         0)) %>% 
  mutate(auth_transition = replace(auth_transition, 
                                  is.na(regtrans), 
                                  0))

df_polity_ctruniform <- df_polity_vargenauth
  
df_polity_ctruniform$cname <- countrycode(df_polity_ctruniform$ccode, 
                                          "cown", 
                                          "country.name")
df_polity_clean <- df_polity_ctruniform %>%
  filter(!is.na(cname)) 

################################################################################
### Accountability Data
################################################################################
load("../data/xnat/accountability/accountability.RData")
df_acc_raw <- accountability

df_acc_rename <- df_acc_raw %>% 
  mutate(country_year = paste(country_name, year, sep = "-"), 
         acc_index = Accountability, 
         acc_horiz = Horizontal, 
         acc_vert = Vertical, 
         acc_diag = Diagonal)

df_acc_rename$country_name = as.character(df_acc_rename$country_name)

df_acc_vnfix <- df_acc_rename %>% 
  mutate(country_name = replace(country_name, 
                                country_text_id == "VNM", 
                                "Vietnam"))

df_acc_vnfix$ccode <- (countrycode(df_acc_vnfix$country_name, 
                                    "country.name", 
                                    "cown"))
df_acc_ccodefix <- df_acc_vnfix %>% 
  mutate(ccode = replace(ccode, 
                         country_name == "Serbia", 
                         345)) %>% 
  mutate(ccode = replace(ccode, 
                         country_name == "S<e3>o Tom<e9> och Pr<ed>ncipe", 
                         403)) %>% 
  mutate(country_name = replace(country_name, 
                                country_name == "S<e3>o Tom<e9> och Pr<ed>ncipe", 
                                "Sao Tome and Principe"))

df_acc_clean <- df_acc_ccodefix %>% 
  select(year,
         acc_index,
         acc_horiz, 
         acc_vert,
         acc_diag,
         ccode)
################################################################################
### Merging Datasets
################################################################################
df_merged_initial <- df_wdidata_clean_final %>%
  full_join(df_ictd_clean, 
            by = c("ccode", 
                   "year")) %>%
  full_join(df_cow_clean, 
            by = "ccode") %>%
  full_join(df_vdem_clean, 
            by = c("ccode", 
                   "year")) %>%
  full_join(df_qog_clean, 
            by = c("ccode", 
                   "year")) %>%
  full_join(df_tid_clean, 
            by = "ccode") %>%
  full_join(df_prio_clean, 
            by = c("ccode", 
                   "year")) %>%
  full_join(df_wbinc_clean, 
            by = c("ccode", 
                   "year")) %>%
  full_join(df_wbreg_clean, 
            by = "ccode") %>%
  full_join(df_polity_clean, 
            by = c("ccode", 
                   "year")) %>% 
  full_join(df_acc_clean, 
            by = c("ccode", 
                   "year"))


df_merged_styearcheck <- df_merged_initial %>%
  mutate(before_cow = ifelse(styear > year & !is.na(styear), 1, 0)) %>%
  filter(!before_cow == 1) 

unique(df_merged_styearcheck$cname)
unique(df_merged_styearcheck$ccode)

df_merged_styearcheck <- df_merged_styearcheck %>% 
  filter(!is.na(ccode))


# Generating an indicator variable for tax introduction

df_merged_taxyear <- df_merged_styearcheck

df_merged_taxyear$PIT <- as.character(df_merged_taxyear$PIT)
df_merged_taxyear$VAT <- as.character(df_merged_taxyear$VAT)
df_merged_taxyear$CIT <- as.character(df_merged_taxyear$CIT)
df_merged_taxyear$GST <- as.character(df_merged_taxyear$GST)

df_merged_taxyearfix <- df_merged_taxyear %>%
  mutate(PIT_i = if_else(year >= PIT, "1", "0", )) %>%
  mutate(PIT_i = replace(PIT_i, 
                         PIT == "7777", 
                         ".a")) %>% 
  mutate(CIT_i = if_else(year >= CIT, "1", "0", )) %>%
  mutate(CIT_i = replace(CIT_i, 
                         CIT == "7777", 
                         ".a")) %>% 
  mutate(GST_i = if_else(year >= GST, "1", "0", )) %>%
  mutate(GST_i = replace(GST_i, 
                         GST == "7777", 
                         ".a")) %>%  
  mutate(VAT_i = if_else(year >= VAT, "1", "0", )) %>%
  mutate(VAT_i = replace(VAT_i, 
                         VAT == "7777", 
                         ".a")) %>% 
  mutate(SSC_i = if_else(year >= SSC, "1", "0", )) %>%
  mutate(SSC_i = replace(SSC_i, 
                         SSC == "7777", 
                         ".a")) %>% 
  mutate(INH_i = if_else(year >= INH, "1", "0", )) %>%
  mutate(INH_i = replace(INH_i, 
                         INH == "7777", 
                         ".a")) 

df_merged_taxyearfix$PIT[df_merged_taxyearfix$PIT == "7777"] <- ".a"
df_merged_taxyearfix$PIT[df_merged_taxyearfix$PIT == "9999"] <- ".b"

df_merged_taxyearfix$CIT[df_merged_taxyearfix$CIT == "7777"] <- ".a"
df_merged_taxyearfix$CIT[df_merged_taxyearfix$CIT == "9999"] <- ".b"

df_merged_taxyearfix$GST[df_merged_taxyearfix$GST == "7777"] <- ".a"
df_merged_taxyearfix$GST[df_merged_taxyearfix$GST == "9999"] <- ".b"

df_merged_taxyearfix$VAT[df_merged_taxyearfix$VAT == "7777"] <- ".a"
df_merged_taxyearfix$VAT[df_merged_taxyearfix$VAT == "9999"] <- ".b"

df_merged_taxyearfix$SSC[df_merged_taxyearfix$SSC == "7777"] <- ".a"
df_merged_taxyearfix$SSC[df_merged_taxyearfix$SSC == "9999"] <- ".b"

df_merged_taxyearfix$INH[df_merged_taxyearfix$INH == "7777"] <- ".a"
df_merged_taxyearfix$INH[df_merged_taxyearfix$INH == "9999"] <- ".b"


  

df_merged_conflictfix <- df_merged_taxyearfix %>%
  mutate(prio_extrasystemic_conflict = replace(prio_extrasystemic_conflict, 
                                               is.na(prio_extrasystemic_conflict), 
                                               0)) %>% 
  mutate(prio_interstate_conflict = replace(prio_interstate_conflict, 
                                               is.na(prio_interstate_conflict), 
                                               0)) %>% 
  mutate(prio_civil_wars = replace(prio_civil_wars, 
                                               is.na(prio_civil_wars), 
                                               0)) %>% 
  mutate(prio_civil_wars_intl = replace(prio_civil_wars_intl, 
                                               is.na(prio_civil_wars_intl), 
                                               0))

df_merged_addvars <- df_merged_conflictfix %>% 
  mutate(log_gdppc = log(wdip_gdp_ppc)) %>%
  mutate(any_election = if_else(((v2xel_elecparl == 1 | 
                                    v2xel_elecpres == 1)) & 
                                  !is.na(v2xel_elecparl), 
                                1, 0, )) %>% 
  mutate(any_election = replace(any_election, 
                                is.na(v2xel_elecparl), 
                                NA)) %>% 
  mutate(log_gdp_ppc_cons = log(wdip_gdp_ppc_cons)) %>% 
  mutate(log_wdi_gnicapatlcur = log(wdi_gnicapatlcur)) %>% 
  mutate(log_wdi_pop = log(wdip_pop_total)) %>% 
  mutate(log_wdi_gdp_cusd = log(wdip_gdp_cusd)) %>%
  mutate(log_wdi_ppp_con_i = log(wdip_gdp_ppp_cusd_i)) %>% 
  mutate(tottax_reliance = tottax/totrev) %>% 
  mutate(indirect_reliance = indirect/totrev) %>% 
  mutate(direct_reliance = direct/totrev) %>% 
  mutate(totnontax_reliance = totnontax/totrev) %>% 
  select(-cname) %>% 
  rename(cname = cname.x)

df_merged_cnamefix <- df_merged_addvars %>% 
  select(-c(cname, 
           cname.x.x, 
           cname.x.x.x, 
           cname.y, 
           cname.y.y, 
           cname.y.y.y, 
           cname.x.x.x.x, 
           cname.y.y.y.y))

df_merged_cnamefix$cname <- countrycode(df_merged_cnamefix$ccode, 
                            "cown", 
                            "country.name")

df_merge_ccodefix <- df_merged_cnamefix %>% 
  mutate(cname = replace(cname, 
                         ccode == 852, 
          "Hong Kong")) %>% 
  mutate(cname = replace(cname, 
                         ccode == 533, 
                         "Aruba")) %>% 
  mutate(cname = replace(cname, 
                         ccode == 665, 
                         "West Bank and Gaza")) %>%
  mutate(country_year = paste(cname, 
                              year, 
                              sep = "-")) %>% 
  select(cname, 
         country_year, 
         ccode, 
         year, 
         wdip_rain:totnontax_reliance) %>%
  filter(!is.na(year))

## Check that no countries have too few years

panel_info <- tapply(df_merge_ccodefix$year, df_merge_ccodefix$ccode, length)
(rm_countries <- unique(df_merge_ccodefix$cname[df_merge_ccodefix$ccode %in% 
                                       as.numeric(names(panel_info[panel_info == 1]))])) # this should be zero
df_merge_regime <- df_merge_ccodefix

df_merge_regime$regime <- ifelse(df_merge_regime$polity2 %in% c(-10:-6), 
                                 "Autocracy", 
                                 ifelse(df_merge_regime$polity2 %in% c(-5:5), 
                                        "Anocracy", 
                                        ifelse(df_merge_regime$polity2 %in% c(6:10), 
                                               "Democracy", NA)))

# Change this to be on 0-100 scale to match tax reliance measures
df_merge_regime$v2x_corr <- df_merge_regime$v2x_corr*100

df_merge_taxcomb <- df_merge_regime

## Combine resource and non-resource indirect taxes

df_merge_taxcomb$comb_indirect <- df_merge_taxcomb$nr_indirect + 
  df_merge_taxcomb$res_indirect

## Create polity interaction variables

df_merge_polityint <- df_merge_taxcomb %>% 
  mutate(indirect_x_polity = comb_indirect*polity2, 
         direct_x_polity =  direct_ex_sc_ex_rt*polity2, 
         indirect_x_polity_sq = indirect_x_polity^2, 
         direct_x_polity_sq = direct_x_polity^2)


## Change all corruption measures to be more = more corruption

df_merge_corruptfix <- df_merge_polityint

df_merge_corruptfix[, c("vdem_excrptps", "vdem_gcrrpt")] <- 
  -1*df_merge_corruptfix[, c("vdem_excrptps", "vdem_gcrrpt")]

# Note: these variables for some reason are coded as higher =  less corruption 
# in the QoG data as of the January 2019 release. Confirm this is true when replicating. 

## Create lag and lead variables

df_merge_corruptfix <- df_merge_corruptfix %>% 
  filter(!is.na(cname)) %>% 
  filter(year > 1978)

df_var <- read.csv("../data/xnat/other/vat_varlist_final_02122020.csv", stringsAsFactors = F)

use_vars <- c(df_var$var_name[!df_var$var_group %in% c("dv", "aux")
                            & !df_var$var_name %in% c("region", "ccode", 
                                                     "cname", "year")], 
             "comb_indirect", "polity2", 
             "indirect_x_polity", "indirect_x_polity_sq", 
             "direct_x_polity", "direct_x_polity_sq")

lg <- function(x)c(NA, x[1:(length(x) - 1)])

df_merge_lag <- df_merge_corruptfix[order(df_merge_corruptfix$ccode, df_merge_corruptfix$year), ]


for(i in 1:length(use_vars)){
  df_merge_lag[, paste("lag1_", 
                       use_vars[i], 
                       sep = "")] <- unlist(tapply(df_merge_lag[, use_vars[i]], 
                                                   df_merge_lag$ccode, 
                                                   lg))
}

df_merge_lag <- df_merge_lag %>% 
  filter(tag == 1) %>%
  filter(year > 1979) 



## Check missingness on these variables 

dep_vars <- c("v2xnp_regcorr", "v2x_partip", 
             "v2x_cspart", "v2csprtcpt", "v2x_corr")
(prop_miss <- lapply(df_merge_lag[, c(use_vars, dep_vars)], 
                    FUN = function(x){sum(is.na(x))/length(x)}))

## Check max and min years on core variables:

max_year <- lapply(df_merge_lag[, df_var$var_name[df_var$var_group %in% c("dv", "key", "revenue")]], 
                  FUN = function(x){max(df_merge_lag$year[!is.na(x)])}) 
min_year <- lapply(df_merge_lag[, df_var$var_name[df_var$var_group %in% c("dv", "key", "revenue")]], 
                  FUN = function(x){min(df_merge_lag$year[!is.na(x)])}) # these should all be zero in imputed data
paste(c(max(unlist(min_year)), min(unlist(max_year))))
if(any(min_year != 1980)){stop("At least one key variable starts after 1980")}

## Get average number of countries per year on core variables

avg_n <- lapply(df_merge_lag[, df_var$var_name[df_var$var_group %in% c("key", "revenue")]], 
               FUN = function(x){mean(tapply(x, df_merge_lag$year, FUN = function(x){length(!is.na(x))}), 
                                      na.rm = T)}) 
min_n <- lapply(df_merge_lag[, df_var$var_name[df_var$var_group %in% c("key", "revenue")]], 
               FUN = function(x){min(tapply(x, df_merge_lag$year, FUN = function(x){length(!is.na(x))}), 
                                     na.rm = T)}) 

## Create squared terms (can't use poly because model won't run)
df_merge_yearsq <- df_merge_lag
df_merge_yearsq$year_sq <- df_merge_yearsq$year^2
df_merge_yearsq$year_cu <- df_merge_yearsq$year^3

## Do any rescaling/variable gen
df_merge_vdemrescale <- df_merge_yearsq
df_merge_vdemrescale$v2x_corr_orig <- df_merge_vdemrescale$v2x_corr # Save the original one
df_merge_vdemrescale$v2x_corr <- scale(-1*df_merge_vdemrescale$v2x_corr_orig) # This makes it so that higher = less corrupt, to match accountability measure
df_merge_vdemrescale$v2xnp_regcorr <- df_merge_vdemrescale$v2xnp_regcorr*100 # Make 0-100 scale to match v2x_corr

dfMerge <- df_merge_vdemrescale

propMiss <- prop_miss

dfVar <- df_var

dfCand <- read.csv("../data/xnat/other/candidate_variables_10Nov2020.csv", stringsAsFactors = F)


save(list = c("dfMerge", "propMiss", "dfVar", "dfCand"), 
     file = "../data/cleaned/vat_panel_cleaned15Nov.RData")

